2 Diving In

Back to top.

2.1 Libraries and Source Files

library(tidyverse)
# library(dict) # Still not found after installation
library(container) # For Dict class
library(useful) # For simple.impute
library(comprehenr) # For list comprehension
library(GGally)
library(reshape2)
library(gridExtra)
library(gplots)
library(DescTools) # For df summary
library(robustHD) # For df summary
library(caret)
library(effsize) # For Cohen's d

source('tools/wrangle.R')
source('tools/eda.R')
source('tools/engineer.R')
source('tools/split.R')

# SEED = 65466

2.2 Loading Data

Here’s the wrangled training set loaded from objects serialized at the end of wrangle_and_split.Rmd.

val_train_X = readRDS("data/val_train_X_wrangled.rds")
val_train_y = readRDS("data/val_train_y_wrangled.rds")

# Merge for easier analysis.
val_train_Xy = merge(
  x = val_train_X,
  y = val_train_y,
  by = 'Id',
  all = TRUE
)

3 EDA and Engineering

Back to top.

str(val_train_Xy)
## 'data.frame':    715 obs. of  81 variables:
##  $ Id           : chr  "1" "10" "1000" "1002" ...
##  $ MSSubClass   : Factor w/ 16 levels "20","30","40",..: 6 16 1 2 1 9 14 1 5 11 ...
##  $ MSZoning     : Factor w/ 8 levels "A","C","FV","I",..: 6 6 6 6 6 6 8 6 6 6 ...
##  $ LotFrontage  : num  65 50 64 60 75 65 21 NA 115 75 ...
##  $ LotArea      : num  8450 7420 6762 5400 11957 ...
##  $ Street       : Ord.factor w/ 3 levels "None"<"Grvl"<..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Alley        : Ord.factor w/ 3 levels "None"<"Grvl"<..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LotShape     : Ord.factor w/ 4 levels "Reg"<"IR1"<"IR2"<..: 1 1 1 1 2 1 1 2 1 1 ...
##  $ LandContour  : Factor w/ 4 levels "Lvl","Bnk","HLS",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Utilities    : Ord.factor w/ 5 levels "None"<"ELO"<"NoSeWa"<..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ LotConfig    : Factor w/ 5 levels "Inside","Corner",..: 1 2 1 2 1 1 1 1 1 1 ...
##  $ LandSlope    : Ord.factor w/ 4 levels "None"<"Gtl"<"Mod"<..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 4 6 18 22 6 11 17 20 8 ...
##  $ Condition1   : Factor w/ 9 levels "Artery","Feedr",..: 3 1 3 3 5 3 3 3 3 3 ...
##  $ Condition2   : Factor w/ 9 levels "Artery","Feedr",..: 3 1 3 3 3 3 3 3 3 3 ...
##  $ BldgType     : Factor w/ 5 levels "1Fam","2FmCon",..: 1 2 1 1 1 1 4 1 1 3 ...
##  $ HouseStyle   : Factor w/ 8 levels "1Story","1.5Fin",..: 4 3 1 1 1 8 4 1 2 1 ...
##  $ OverallQual  : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 7 5 7 5 8 5 4 6 5 5 ...
##  $ OverallCond  : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 6 5 6 5 8 4 7 5 5 ...
##  $ YearBuilt    : int  2003 1939 2006 1920 2006 1977 1970 1977 1948 1965 ...
##  $ YearRemodAdd : int  2003 1950 2006 1950 2006 1977 1970 2001 1950 1965 ...
##  $ RoofStyle    : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 4 ...
##  $ RoofMatl     : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Exterior1st  : Factor w/ 17 levels "AsbShng","AsphShn",..: 15 9 15 16 15 7 6 11 16 2 ...
##  $ Exterior2nd  : Factor w/ 18 levels "AsbShng","AsphShn",..: 15 9 15 16 15 7 6 11 16 2 ...
##  $ MasVnrType   : Factor w/ 5 levels "BrkCmn","BrkFace",..: 2 4 5 4 2 2 4 2 4 4 ...
##  $ MasVnrArea   : num  196 0 24 0 53 220 0 28 0 0 ...
##  $ ExterQual    : Ord.factor w/ 5 levels "Po"<"Fa"<"TA"<..: 4 3 4 3 4 4 3 3 3 3 ...
##  $ ExterCond    : Ord.factor w/ 5 levels "Po"<"Fa"<"TA"<..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Foundation   : Factor w/ 6 levels "BrkTil","CBlock",..: 3 1 3 1 3 2 2 3 2 2 ...
##  $ BsmtQual     : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 5 4 5 3 5 5 4 4 4 1 ...
##  $ BsmtCond     : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 4 4 4 4 4 5 4 4 4 1 ...
##  $ BsmtExposure : Ord.factor w/ 5 levels "None"<"No"<"Mn"<..: 2 2 4 2 2 4 2 3 2 1 ...
##  $ BsmtFinType1 : Ord.factor w/ 7 levels "None"<"Unf"<"LwQ"<..: 7 7 7 2 7 7 5 6 2 1 ...
##  $ BsmtFinSF1   : num  706 851 686 0 24 595 273 1200 0 0 ...
##  $ BsmtFinType2 : Ord.factor w/ 7 levels "None"<"Unf"<"LwQ"<..: 2 2 2 2 2 2 3 2 2 1 ...
##  $ BsmtFinSF2   : num  0 0 0 0 0 0 273 0 0 0 ...
##  $ BsmtUnfSF    : num  150 140 501 691 1550 390 0 410 720 0 ...
##  $ TotalBsmtSF  : num  856 991 1187 691 1574 ...
##  $ Heating      : Factor w/ 7 levels "None","Floor",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ HeatingQC    : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 6 6 6 6 6 4 4 5 4 4 ...
##  $ CentralAir   : Ord.factor w/ 2 levels "N"<"Y": 2 2 2 2 2 2 2 2 2 1 ...
##  $ Electrical   : Factor w/ 6 levels "None","SBrkr",..: 2 2 2 3 2 2 2 2 2 2 ...
##  $ X1stFlrSF    : num  856 1077 1208 691 1574 ...
##  $ X2ndFlrSF    : num  854 0 0 0 0 0 546 0 551 0 ...
##  $ LowQualFinSF : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : num  1710 1077 1208 691 1574 ...
##  $ BsmtFullBath : int  1 1 1 0 0 0 0 1 0 0 ...
##  $ BsmtHalfBath : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 1 2 1 2 2 1 2 2 2 ...
##  $ HalfBath     : int  1 0 0 0 0 0 1 0 0 0 ...
##  $ BedroomAbvGr : int  3 2 2 2 3 3 3 3 4 4 ...
##  $ KitchenAbvGr : int  1 2 1 1 1 1 1 1 1 2 ...
##  $ KitchenQual  : Ord.factor w/ 5 levels "Po"<"Fa"<"TA"<..: 4 3 4 5 4 3 3 4 3 3 ...
##  $ TotRmsAbvGrd : int  8 5 6 4 7 6 6 6 7 8 ...
##  $ Functional   : Ord.factor w/ 8 levels "Sal"<"Sev"<"Maj2"<..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ Fireplaces   : int  0 2 0 0 1 0 0 2 1 0 ...
##  $ FireplaceQu  : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 1 4 1 1 5 1 1 4 5 1 ...
##  $ GarageType   : Factor w/ 7 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 2 7 ...
##  $ GarageYrBlt  : int  2003 1939 2006 1920 2006 1977 1970 1977 1948 NA ...
##  $ GarageFinish : Ord.factor w/ 4 levels "None"<"Unf"<"RFn"<..: 3 3 3 2 3 2 3 3 2 1 ...
##  $ GarageCars   : int  2 1 2 1 3 1 1 2 1 0 ...
##  $ GarageArea   : num  548 205 632 216 824 328 286 480 312 0 ...
##  $ GarageQual   : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 4 5 4 3 4 4 4 4 4 1 ...
##  $ GarageCond   : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 4 4 4 4 4 4 4 4 4 1 ...
##  $ PavedDrive   : Ord.factor w/ 4 levels "None"<"N"<"P"<..: 4 4 4 2 4 4 4 4 4 4 ...
##  $ WoodDeckSF   : num  0 0 105 0 144 210 238 168 0 0 ...
##  $ OpenPorchSF  : num  61 4 61 20 104 0 0 68 0 0 ...
##  $ EnclosedPorch: num  0 0 0 94 0 0 0 0 108 0 ...
##  $ X3SsnPorch   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ScreenPorch  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Fence        : Factor w/ 5 levels "GdPrv","MnPrv",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ MiscFeature  : Factor w/ 6 levels "Elev","Gar2",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ MiscVal      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ MoSold       : int  2 1 2 1 7 11 8 2 8 5 ...
##  $ YrSold       : int  2008 2008 2010 2007 2008 2008 2009 2010 2008 2010 ...
##  $ SaleType     : Factor w/ 10 levels "WD","CWD","VWD",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ SaleCondition: Factor w/ 6 levels "Normal","Abnorml",..: 1 1 1 2 1 1 1 1 1 1 ...
##  $ SalePrice    : num  208500 118000 206000 86000 232000 ...

There are 715 total observations in the validation training set, across 81 features (target feature “Saleprice” and id column “Id” included). 46 features are factors, 24 of which are ordered. 14 are integers. 20 are doubles, including “SalePrice”. (“Id” is integers cast as a character type.)

3.1 Correlations

The full correlation grid is too large for most screens, but there are only a handful of noteworthy correlations which I’ll include with further analysis of each feature.

# ggcorr(
#   select(val_train_Xy, where(is.numeric)),
#   label = T,
#   label_round = 2,
#   label_size = 3
# )

3.2 Normalizing Continuous Variables

I wrote a simple algorithm to try various transformations on each continuous feature and use the Shapiro-Wilk Normality Test to choose the best transformation. I then visualize each feature and decide if it even makes sense to attempt normalization.

I should have made logarithmic transformations be that of x+1, but instead I excluded 0s, which only partially handled the issue. 1s still convert to 0s in that case. The result was that variables with a substantial number of 1s did not find logs very useful for normalization.

I also did not include more dynamic transformations like Box-Cox. The script could be modified to include them.

For variables in which a 0 indicates a missing feature, I normalized only the non-zero set. The idea was that it might aid regression when the variable is put into interaction with its missingness.

funcs_lst = list(
    'no_func' = function (x) { x },
    'sqrt' = sqrt,
    'cbrt' = function(x) { x^(1/3) },
    'square' = function(x) { x^2 },
###
### FIXME
# Make log transformations of x+1.
###
    'log' = log,
    'log2' = log2,
    'log10' = log10,
    '1/x' = function (x) { 1/x },
    '2^(1/x)' = function (x) { 2^(1/x) }
    # Box Cox: write function that calls MASS::boxcox and include lambda in results/function returned.
    # Yeo-Johnson
    # Winsorize here?
    # Rank
    # Rank-Gauss
  )
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

print("Best normalizing transformations:")
## [1] "Best normalizing transformations:"
for (feat in names(best_normalizers)) {
  func_name = best_normalizers[[feat]]$best_func$name
  print(
    paste(
      feat, ":", func_name,
      ", p-value:", best_normalizers[[feat]]$results[[func_name]]$p.value
    )
  )
}
## [1] "LotFrontage : log10 , p-value: 0.896193826437694"
## [1] "LotArea : log10 , p-value: 2.26152919548748e-17"
## [1] "YearBuilt : square , p-value: 0.0108822707033521"
## [1] "YearRemodAdd : no_func , p-value: 0.0256596409501691"
## [1] "MasVnrArea : cbrt , p-value: 0.954476102599975"
## [1] "BsmtFinSF1 : cbrt , p-value: 4.83197463170721e-05"
## [1] "BsmtFinSF2 : cbrt , p-value: 0.576669464478684"
## [1] "BsmtUnfSF : cbrt , p-value: 0.00100793900881088"
## [1] "TotalBsmtSF : log , p-value: 2.08019611846809e-06"
## [1] "X1stFlrSF : log2 , p-value: 0.0309515511380178"
## [1] "X2ndFlrSF : sqrt , p-value: 0.580431655244041"
## [1] "LowQualFinSF : sqrt , p-value: 0.12996282240508"
## [1] "GrLivArea : log2 , p-value: 0.0129611332961232"
## [1] "BsmtFullBath :  , p-value: "
## [1] "BsmtHalfBath :  , p-value: "
## [1] "FullBath :  , p-value: "
## [1] "HalfBath :  , p-value: "
## [1] "BedroomAbvGr : sqrt , p-value: 0.995915265851324"
## [1] "KitchenAbvGr :  , p-value: "
## [1] "TotRmsAbvGrd : no_func , p-value: 0.972016654577295"
## [1] "Fireplaces :  , p-value: "
## [1] "GarageYrBlt : square , p-value: 0.00823549702716429"
## [1] "GarageCars : no_func , p-value: 0.971877057620897"
## [1] "GarageArea : sqrt , p-value: 0.203272774347792"
## [1] "WoodDeckSF : cbrt , p-value: 0.991997742575695"
## [1] "OpenPorchSF : cbrt , p-value: 0.898709370969093"
## [1] "EnclosedPorch : no_func , p-value: 0.526291715345031"
## [1] "X3SsnPorch : sqrt , p-value: 0.516253205278421"
## [1] "ScreenPorch : cbrt , p-value: 0.608040279289975"
## [1] "PoolArea :  , p-value: "
## [1] "MiscVal : log2 , p-value: 0.280656484036341"
## [1] "MoSold : no_func , p-value: 0.875731443365881"
## [1] "YrSold : no_func , p-value: 0.96717393596804"
## [1] "SalePrice : log , p-value: 0.72923438230646"

3.3 SalePrice

Back to top.

3.3.1 Normalize

x = 'SalePrice'
summary(val_train_Xy[[x]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   39300  129950  160000  181697  212500  745000
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 5000,
  t_binw = 1/50
)

## NULL
val_train_Xy = val_train_Xy %>%
  mutate('log(SalePrice)' = log(SalePrice))

x = 'log(SalePrice)'

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  log(SalePrice) 
##  Min.   :10.58  
##  1st Qu.:11.77  
##  Median :11.98  
##  Mean   :12.03  
##  3rd Qu.:12.27  
##  Max.   :13.52
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 1/100,
  t_binw = 1/100
)

## NULL

A natural log best normalizes the sale price distribution. However, because it isn’t a log10 transformation, it won’t precisely scale the prediction errors proportionally to the sale price. I could simply use the log10 instead, which does a fair job at normalizing as well, but I want to stick with the “best” transformation to make the best model. So, when I run ML, I will write a custom summary function to train with which simply divides the error by the price (the log(SalePrice)) before calculating the RMSE.

3.3.2 Winsorize

Here I’m looking for the best Winsorization quantile values of the best transformation – the best according to Shapiro-Wilk p-values. I could have programmatically explored the space and returned the best result. But, I want to visualize it and explore the process itself. In future projects, I might choose to further automate this.

It should be noted that Winsorization of the target variable should only be used for training, not for testing. A log transformation can be reversed as a vectorized operation, but Winsorization can’t. Winsorization should improve the accuracy of the model, but would be cheating on the test.

qqnorm(y = val_train_Xy$SalePrice, ylab = 'SalePrice')
qqline(y = val_train_Xy$SalePrice, ylab = 'SalePrice')

qqnorm(y = val_train_Xy$`log(SalePrice)`, ylab = 'log(SalePrice)')
qqline(y = val_train_Xy$`log(SalePrice)`, ylab = 'log(SalePrice)')

Win_log_x = Winsorize(
  x = val_train_Xy[['log(SalePrice)']],
  probs = c(0.005, 0.995)
)

qqnorm(y = Win_log_x, ylab = 'Win_log_x')
qqline(y = Win_log_x, ylab = 'Win_log_x')

Win_raw_x = Winsorize(
  x = val_train_Xy[['SalePrice']],
  probs = c(0, 0.95)
)

qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')

print(shapiro.test(x = val_train_Xy$SalePrice))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy$SalePrice
## W = 0.86089, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`log(SalePrice)`))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy$`log(SalePrice)`
## W = 0.9904, p-value = 0.0001299
print(shapiro.test(x = Win_log_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_log_x
## W = 0.99062, p-value = 0.0001609
print(shapiro.test(x = Win_raw_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_raw_x
## W = 0.93275, p-value < 2.2e-16

A small Winsorization of the log best normalizes the variable (W = 0.99062). It doesn’t pass the test for normality (p < 0.01), but it is still better prepared for a linear regression.

val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(SalePrice)' = Winsorize(
      SalePrice,
      probs = c(0, 0.95),
      na.rm = T
    )
  ) %>%
  mutate(
    'Win(log(SalePrice))' = Winsorize(
      log(SalePrice),
      probs = c(0.005, 0.995),
      na.rm = T
    )
  )

3.3.3 Correlations

Here are the correlations between SalePrice and the rest of the variables, compared to those of the transformed variables. Transforming SalePrice resulted in minor increases of correlations to many other continuous features and some minor reductions of correlations, an overall minor and insignificant improvement. But, this is the first of the variables to be transformed.

num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x = 'Win(log(SalePrice))'
x_lst = c('SalePrice', 'log(SalePrice)', 'Win(log(SalePrice))',
          'Win(SalePrice)')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##    SalePrice        log(SalePrice)     Win(log(SalePrice)) Win(SalePrice)    
##  Min.   :0.007746   Min.   :0.001275   Min.   :0.001406    Min.   :0.006338  
##  1st Qu.:0.124877   1st Qu.:0.130580   1st Qu.:0.131486    1st Qu.:0.116871  
##  Median :0.306100   Median :0.314440   Median :0.314346    Median :0.312263  
##  Mean   :0.310147   Mean   :0.322002   Mean   :0.322153    Mean   :0.317893  
##  3rd Qu.:0.520650   3rd Qu.:0.538367   3rd Qu.:0.539042    3rd Qu.:0.515569  
##  Max.   :0.689569   Max.   :0.687804   Max.   :0.686782    Max.   :0.686731
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

3.3.4 Hard Code

I’ll need to hard code the top and bottom limits for the engineering script to apply to the test set without leakage. And, I’ll need to drop Win(SalePrice).

I want to keep the transformed and non-Winsorized version though. In the case of the target variable, I’ll train on the Winsorized variable and test on the transformed because I can reverse the transformation. In the case of predictor variables, the Winsorized version will be good for basic linear regression without interactions, but not necessarily for KNN and RF which may be able to use the outliers for clustering and grouping.

x = 'Win(log(SalePrice))'

min_val = min(val_train_Xy[[x]])
max_val = max(val_train_Xy[[x]])
print(paste("min_val:", min_val))
## [1] "min_val: 10.9579480025541"
print(paste("max_val:", max_val))
## [1] "max_val: 13.2279465702719"
val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(log(SalePrice))' = Winsorize(
      .data[['log(SalePrice)']],
      minval = min_val,
      maxval = max_val
    )
  ) %>%
  select(-c('Win(SalePrice)'))

gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1/50)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)

3.3.5 SalePrice as Factor

To aid visualization, I’ll create a SalePrice factor with extremes and quartiles as levels.

val_train_Xy = val_train_Xy %>%
  mutate(
    'SalePrice.fact' = cut(
      x = SalePrice,
      breaks = quantile(x = SalePrice),
      include.lowest = T,
      ordered_result = T
    )
  )

summary(val_train_Xy$SalePrice.fact)
##  [3.93e+04,1.3e+05]   (1.3e+05,1.6e+05]  (1.6e+05,2.12e+05] (2.12e+05,7.45e+05] 
##                 179                 179                 178                 179

3.4 MSSubClass (Dwelling Type)

Back to top.

I’ll use log(SalePrice) to visualize against factors, rather than the Winsorized version.

x = 'MSSubClass'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "60"  "20"  "30"  "80"  "160" "50"  "70"  "120"

The most common class is 1-story newer than 1945 (267), followed by 2-story newer than 1945 (149) and 1.5-story finished all ages (67). The priciest class is 2-story newer than 1945, though some classes are so uncommon that it’s hard to say this completely confidently.

This feature is a mix of information mostly covered by HouseStyle, YearBuilt, and square footage. It might be worth dropping it to avoid overweighting this info, avoid spurious fit, and skip the compute cost of 16 one-hot features. Alternatively, decomposition with PCA might help pull out the unique information regarding PUD housing.

3.5 MSZoning

Back to top.

x = 'MSZoning'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "RL" "RM" "FV"

Mostly residential low density (574), some medium density (100), fewer floating village residential (flexible zoning, 31). Predictive power may be limited due to lack of diversity. That said, low-density residential and floating village clearly tend to sell for more than medium-density. Consider only one-hot encoding RL, RM, and FV?

3.6 LotFrontage

Back to top.

3.6.1 Normalize

x = 'LotFrontage'

summary(val_train_Xy[[x]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   58.00   70.00   70.04   80.00  313.00     133
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 5,
  t_binw = 1/50
)

## NULL

A log10 scale centers it better (133 missing values excluded).

Note the extreme spike (~65 observations) in left of mode (70-75 SF) at 60 SF. It doesn’t seem to be associated with any particular neighborhood or lot configuration or anything, but probably just a common way to cut lots.

The feature could benefit from top/bottom coding.

133 NAs. Counterintuitively, a lower proportion of missing LotFrontages are inside lots (55/121 in the NA subset vs. 511/715 in the training set [these numbers are from a previous split and not accurate for the current data set]), whereas many lots that by definition have frontage (44 corner lots, FR2, and FR3) are missing frontage values. Could use LotArea, LotShape, LotConfig, and (?) (all of which aren’t missing values) to multivariate impute.

x = 'log10(LotFrontage)'
val_train_Xy = val_train_Xy %>%
  mutate(
    'log10(LotFrontage)' = ifelse(
    LotFrontage == 0,
    0,
    log10(LotFrontage)
    )
  )

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  log10(LotFrontage)
##  Min.   :1.322     
##  1st Qu.:1.763     
##  Median :1.845     
##  Mean   :1.820     
##  3rd Qu.:1.903     
##  Max.   :2.496     
##  NA's   :133
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 1/50,
  t_binw = 1/50
)

## NULL

3.6.2 Winsorize

qqnorm(y = val_train_Xy$LotFrontage, ylab = 'LotFrontage')
qqline(y = val_train_Xy$LotFrontage, ylab = 'LotFrontage')

qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)

Win_log10_x = Winsorize(
  x = val_train_Xy[[x]],
  probs = c(0.05, 0.99),
  na.rm = T
)

qqnorm(y = Win_log10_x, ylab = 'Win_log10_x')
qqline(y = Win_log10_x, ylab = 'Win_log10_x')

Win_raw_x = Winsorize(
  x = val_train_Xy$LotFrontage,
  probs = c(0.05, 0.95),
  na.rm = T
)

qqnorm(y = Win_raw_x, ylab = 'Win(LotFrontage)')
qqline(y = Win_raw_x, ylab = 'Win(LotFrontage)')

print(shapiro.test(x = val_train_Xy$LotFrontage))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy$LotFrontage
## W = 0.87621, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy[[x]]))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy[[x]]
## W = 0.94238, p-value = 3.051e-14
print(shapiro.test(x = Win_log10_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_log10_x
## W = 0.97393, p-value = 1.16e-08
print(shapiro.test(x = Win_raw_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_raw_x
## W = 0.9771, p-value = 6.718e-08

It looks like just Winsorizing the raw variable may be the way to go here.

val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(LotFrontage)' = Winsorize(
      LotFrontage,
      probs = c(0.05, 0.95),
      na.rm = T
      )
    ) %>%
  mutate(
    'Win(log10(LotFrontage))' = Winsorize(
      log10(LotFrontage),
      probs = c(0.05, 0.99),
      na.rm = T
      )
    )

3.6.3 Correlations

x = 'Win(LotFrontage)'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('LotFrontage', 'log10(LotFrontage)', 'Win(log10(LotFrontage))', 'Win(LotFrontage)')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##   LotFrontage       log10(LotFrontage) Win(log10(LotFrontage))
##  Min.   :0.001316   Min.   :0.002122   Min.   :0.002136       
##  1st Qu.:0.052789   1st Qu.:0.036725   1st Qu.:0.036057       
##  Median :0.152575   Median :0.123545   Median :0.121036       
##  Mean   :0.189170   Mean   :0.163383   Mean   :0.162507       
##  3rd Qu.:0.318210   3rd Qu.:0.285571   3rd Qu.:0.300117       
##  Max.   :0.515867   Max.   :0.466945   Max.   :0.430664       
##  Win(LotFrontage) 
##  Min.   :0.00368  
##  1st Qu.:0.05062  
##  Median :0.13177  
##  Mean   :0.17398  
##  3rd Qu.:0.31994  
##  Max.   :0.44127
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

y_lst = c('log(SalePrice)')
for (feat in x_lst) {
  plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}

3.6.4 Hard Code

x = 'Win(LotFrontage)'

min_val = min(val_train_Xy[!is.na(val_train_Xy[[x]]), x])
max_val = max(val_train_Xy[!is.na(val_train_Xy[[x]]), x])
print(paste("min_val:", min_val))
## [1] "min_val: 34.05"
print(paste("max_val:", max_val))
## [1] "max_val: 108"
val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(LotFrontage)' = Winsorize(
      LotFrontage,
      minval = min_val,
      maxval = max_val
    )
  )

gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)

3.6.5 By Factors

y_lst = c('MSSubClass', 'MSZoning', 'LotShape', 'LotConfig', 'Neighborhood',
          'BldgType', 'HouseStyle')
for (y in y_lst) {
  plt = fenced_jbv(
    data = val_train_Xy, x = y, y = 'log10(LotFrontage)') +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(plt)
}

Overall, the clusters of lots at 60’ and 80’ is quite apparent.

Unsurprisingly, low-density residential tends to have more lot frontage than medium-density residential. Slightly irregular lots might tend to have more frontage than regular, but we can’t say that with much confidence. Corner lots have more frontage than inside lots. There’s quite a bit of variation between neighborhoods.

Looking at MSSubClass, older homes tend to have less lot frontage than their equivalent house styles after 1945, unless they are PUD homes, which tend to have much less frontage than the rest of the classes. This connection is not visible in YearBuilt, which has no correlation to LotFrontage.

There’s also an interesting gap between 50’ and 60’ that only homes older than 1945 tend to fill, except for PUD homes.

fenced_jbv(
  data = val_train_Xy,
  x = 'MSSubClass',
  y = 'log10(LotFrontage)',
  jit_col = 'SalePrice.fact',
  leg_lbl = 'SalePrice',
  jit_alpha = 0.5,
  box_color = 'red'
)

ggplot(val_train_Xy, aes(x = `log10(LotFrontage)`, y = `log(SalePrice)`)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = 'lm') +
  facet_wrap(vars(MSSubClass), ncol = 5)

3.7 LotArea

Back to top.

3.7.1 Normalize

x = 'LotArea'
summary(val_train_Xy[[x]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1596    7610    9477   10346   11611  115149
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 200,
  t_binw = 1/50
)

## NULL
x_trans = 'log10(LotArea)'
val_train_Xy = val_train_Xy %>%
  mutate('log10(LotArea)' = log10(LotArea))

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x_trans])
##  log10(LotArea) 
##  Min.   :3.203  
##  1st Qu.:3.881  
##  Median :3.977  
##  Mean   :3.960  
##  3rd Qu.:4.065  
##  Max.   :5.061
sum_and_trans_cont(
  data = val_train_Xy,
  x = x_trans,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 1/50,
  t_binw = 1/500
)

## NULL
x_trans = 'log10(log10(LotArea))'
val_train_Xy = val_train_Xy %>%
  mutate('log10(log10(LotArea))' = log10(log10(LotArea)))

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x_trans])
##  log10(log10(LotArea))
##  Min.   :0.5056       
##  1st Qu.:0.5890       
##  Median :0.5995       
##  Mean   :0.5970       
##  3rd Qu.:0.6090       
##  Max.   :0.7043
sum_and_trans_cont(
  data = val_train_Xy,
  x = x_trans,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 1/500,
  t_binw = 1/750
)

## NULL

I doubt it’s worth doing the third log10 transformation now that the median and mean are so close. It still needs top- and bottom-coding anyway.

Even the second transformation might lead to overfit, but I’ll roll with it.

3.7.2 Winsorize

qqnorm(y = val_train_Xy$LotArea, ylab = 'LotArea')
qqline(y = val_train_Xy$LotArea, ylab = 'LotArea')

qqnorm(y = val_train_Xy$`log10(LotArea)`, ylab = 'log10(LotArea)')
qqline(y = val_train_Xy$`log10(LotArea)`, ylab = 'log10(LotArea)')

qqnorm(
  y = val_train_Xy$`log10(log10(LotArea))`,
  ylab = 'log10(log10(LotArea))'
)
qqline(
  y = val_train_Xy$`log10(log10(LotArea))`,
  ylab = 'log10(log10(LotArea))'
)

Win_log10log10_x = Winsorize(
  x = val_train_Xy$`log10(log10(LotArea))`,
  probs = c(0.05, 0.99),
  na.rm = T
)

qqnorm(y = Win_log10log10_x, ylab = 'Win(log10(log10(LotArea)))')
qqline(y = Win_log10log10_x, ylab = 'Win(log10(log10(LotArea)))')

Win_log10_x = Winsorize(
  x = val_train_Xy$`log10(LotArea)`,
  probs = c(0.05, 0.99),
  na.rm = T
)

qqnorm(y = Win_log10_x, ylab = 'Win(log10(LotArea))')
qqline(y = Win_log10_x, ylab = 'Win(log10(LotArea))')

Win_raw_x = Winsorize(
  x = val_train_Xy$LotArea,
  probs = c(0.01, 0.95),
  na.rm = T
)

qqnorm(y = Win_raw_x, ylab = 'LotArea')
qqline(y = Win_raw_x, ylab = 'LotArea')

print(shapiro.test(x = val_train_Xy$LotArea))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy$LotArea
## W = 0.5211, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`log10(LotArea)`))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy$`log10(LotArea)`
## W = 0.9113, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`log10(log10(LotArea))`))
## 
##  Shapiro-Wilk normality test
## 
## data:  val_train_Xy$`log10(log10(LotArea))`
## W = 0.90238, p-value < 2.2e-16
print(shapiro.test(x = Win_log10log10_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_log10log10_x
## W = 0.94232, p-value = 4.825e-16
print(shapiro.test(x = Win_log10_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_log10_x
## W = 0.94434, p-value = 9.78e-16
print(shapiro.test(x = Win_raw_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_raw_x
## W = 0.97759, p-value = 5.239e-09

It looks like simply Winsorizing the base variable might be best.

val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(LotArea)' = Winsorize(
      LotArea,
      probs = c(0.01, 0.95),
      na.rm = T
      )
    ) %>%
  mutate(
    'Win(log10(LotArea))' = Winsorize(
      log10(LotArea),
      probs = c(0.05, 0.99),
      na.rm = T
      )
    )

3.7.3 Correlations

Transforming LotArea with log10 resulted in bigger swings in r in both directions, but no real change in aggregate. The additional log10 transformation produced minor changes in correlation compared to the initial transformation, mostly toward less correlation.

Unsurprisingly, transformed LotArea is much more correlated to transformed LotFrontage than their untransformed counterparts (r went from 0.51 to 0.73). Also, transformed LotArea and transformed SalePrice correlate a little better than their untransformed counterparts, but are still weakly correlated (r increased from 0.30 to 0.37).

It also became noticeably more correlated to square footage of the first floor, bedrooms / total rooms above ground, and garage area/cars. Like previous transformations, some correlations lessened with this transformation, but none so much that the correlation dropped a bracket, e.g. from weak to insignificant.

The distribution of correlations dropped with the second transformation. I’ll only use the first transformation in the engineering script.

num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('LotArea', 'log10(LotArea)', 'log10(log10(LotArea))', 'Win(log10(LotArea))', 'Win(LotArea)')

x = 'Win(LotArea)'

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(df)
##     LotArea          log10(LotArea)     log10(log10(LotArea))
##  Min.   :0.0008156   Min.   :0.001463   Min.   :0.001822     
##  1st Qu.:0.0298980   1st Qu.:0.038420   1st Qu.:0.038591     
##  Median :0.1411549   Median :0.128146   Median :0.125858     
##  Mean   :0.1692131   Mean   :0.210784   Mean   :0.208681     
##  3rd Qu.:0.2935533   3rd Qu.:0.351718   3rd Qu.:0.343608     
##  Max.   :0.5149010   Max.   :0.728610   Max.   :0.741522     
##  Win(log10(LotArea))  Win(LotArea)      
##  Min.   :0.005534    Min.   :0.0001219  
##  1st Qu.:0.045043    1st Qu.:0.0704055  
##  Median :0.124050    Median :0.1270071  
##  Mean   :0.216047    Mean   :0.2244330  
##  3rd Qu.:0.359245    3rd Qu.:0.3686740  
##  Max.   :0.697739    Max.   :0.6861456
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

y_lst = c('log(SalePrice)')
for (feat in x_lst) {
  plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}

3.7.4 Hard Code

x = 'Win(LotArea)'

min_val = min(val_train_Xy[!is.na(val_train_Xy[[x]]), x])
max_val = max(val_train_Xy[!is.na(val_train_Xy[[x]]), x])
print(paste("min_val:", min_val))
## [1] "min_val: 1975.96"
print(paste("max_val:", max_val))
## [1] "max_val: 16946.4"
val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(LotArea)' = Winsorize(
      LotArea,
      minval = min_val,
      maxval = max_val
    )
  ) %>%
  select(-c('log10(LotArea)', 'Win(log10(LotArea))'))

gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 100)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)

y_lst = c('log(SalePrice)', 'Win(LotFrontage)', 'X1stFlrSF',
          'GrLivArea','TotRmsAbvGrd', 'GarageArea')
x = 'Win(LotArea)'
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)

## NULL

3.7.5 By Factors

y_lst = c('MSSubClass', 'MSZoning', 'LotShape', 'LotConfig', 'Neighborhood',
          'BldgType', 'HouseStyle')
for (y in y_lst) {
  plt = fenced_jbv(
    data = val_train_Xy,
    x = y,
    y = 'log10(log10(LotArea))',
    jit_h = 0 # Again R randomly decides to go wonk unless I enter the default.
  ) +
    theme(axis.text.x = element_text(angle = 45, hjust=1))
  print(plt)
}

There are similar patterns as in LotFrontage, with a more marked upward trend against LotShape, and with less difference between lot configurations. There’s also an interesting pocket of two-story houses with low lot area; it’s not worth plotting, but you can see which neighborhoods these are.

3.8 Street

Back to top.

Really lopsided to paved (3 gravel, 712 paved). Drop this feature.

summary(val_train_Xy$Street)
## None Grvl Pave 
##    0    3  712
val_train_Xy = select(val_train_Xy, -c('Street'))

3.9 Alley

Back to top.

Vast majority have none, drop it.

summary(val_train_Xy$Alley)
## None Grvl Pave 
##  669   24   22
val_train_Xy = select(val_train_Xy, -c('Alley'))

3.10 LotShape

Back to top.

x = 'LotShape'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Reg" "IR1"

Irregularly shaped lots tend to sell for more. Good candidate for binarization if doing a basic linear regression with no interactions; one-hot encode ‘Reg’ and drop the rest of the levels.

3.11 LandContour

Back to top.

summary(val_train_Xy$LandContour)
## Lvl Bnk HLS Low 
## 642  33  24  16
val_train_Xy = select(val_train_Xy, -c('LandContour'))

3.12 Utilities

Back to top.

All all-public. Definitely drop.

summary(val_train_Xy$Utilities)
##   None    ELO NoSeWa NoSewr AllPub 
##      0      0      0      0    715
val_train_Xy = select(val_train_Xy, -c('Utilities'))

3.13 LotConfig

Back to top.

x = 'LotConfig'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Inside"  "Corner"  "CulDSac"

This set is mostly inside lots (513) but plenty of corner lots (124). I’ve always thought corner lots are prized, but it doesn’t seem to significantly add to price compared to an inside lot.

Cul de sacs are the priciest. This is somewhat a proxy for neighborhood (and maybe other features) as it is true across most neighborhoods except those that are already pricey (where cul de sacs are relatively more common) or least pricey (where cul de sacs don’t typically exist).

ggplot(
  data = val_train_Xy,
  # data = filter(
  #   val_train_Xy,
  #   LotConfig %in% c('Inside', 'Corner', 'CulDSac')
  # ),
  mapping = aes(
    x = Neighborhood,
    y = `log(SalePrice)`,
  )
) +
  geom_jitter(
    alpha = .4,
    mapping = aes(color = LotConfig, shape = LotConfig)
  ) +
  geom_boxplot(notch = T, varwidth = T, alpha = 0) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(
  data = val_train_Xy,
  mapping = aes(
    x = Neighborhood,
    y = `log(SalePrice)`,
  )
) +
  geom_jitter(
    alpha = .3,
    mapping = aes(color = LotShape, shape = LotShape)
  ) +
  geom_boxplot(notch = T, varwidth = T, alpha = 0) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(
  data = val_train_Xy,
  mapping = aes(
    x = LotConfig,
    y = `log(SalePrice)`,
  )
) +
  geom_jitter(
    alpha = .4,
    mapping = aes(color = LotShape, shape = LotShape)
  ) +
  geom_boxplot(notch = T, varwidth = T, alpha = 0)

ggplot(
  data = val_train_Xy,
  mapping = aes(
    x = LotShape,
    y = `log(SalePrice)`,
  )
) +
  geom_jitter(
    alpha = .4,
    mapping = aes(color = LotConfig, shape = LotConfig)
  ) +
  geom_boxplot(notch = T, varwidth = T, alpha = 0)

There appears to be some overlap with lot shape and lot configuration.

3.14 LandSlope

Back to top.

Vast majority are gentle drop it.

summary(val_train_Xy$LandSlope)
## None  Gtl  Mod  Sev 
##    0  677   32    6
val_train_Xy = select(val_train_Xy, -c('LandSlope'))

3.15 Neighborhood

Back to top.

x = 'Neighborhood'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    keysize = 2
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
##  [1] "CollgCr" "BrkSide" "OldTown" "Somerst" "NWAmes"  "Sawyer"  "Edwards"
##  [8] "Crawfor" "NridgHt" "Gilbert" "Names"

North Ames (Names) and CollgCr have the most residential homes (107 and 76). but they’re also zoned low-density. A handful of neighborhoods have almost no houses in this set.

The priciest neighborhoods are StoneBr, NridgeHt, and NoRidge. The least pricey (OldTown, MeadowV, and IDOTRR) are also the most dense and commercial. There are significant differences in prices between many neighborhoods, and not just between the cheapest and priciest.

ggplot(
  data = val_train_Xy,
  aes(x = Neighborhood, fill = MSZoning)
) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

3.16 Condition1, Condition2

Back to top.

The vast majority are normal. Drop Condition2. But, for Condition1, there appear to be significant differences in SalePrice between Norm and Feedr, and maybe between Norm and Artery but there are too few Artery observations. It might be worth one-hot-encoding those categories.

x = 'Condition1'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Norm"  "Feedr"

You might consider binarizing, lumping ‘Feedr’ and ‘Artery’ and maybe ‘RRAe’ together and lumping the rest with ‘Norm’. I’ll try it out during feature selection with caret in the ML phase.

summary(val_train_Xy$Condition2)
## Artery  Feedr   Norm   RRNn   RRAn   PosN   PosA   RRNe   RRAe 
##      1      2    709      1      0      0      1      0      1
val_train_Xy = select(val_train_Xy, -c('Condition2'))

3.17 BldgType

Back to top.

x = 'BldgType'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## character(0)

Majority single-family (603), 24 NAs. It seems like an inherently important feature, despite the lopsidedness of the distribution. Probably safe to impute to mode (1Family), but other features might inform, such as MSSubClass, MSZoning, Neighborhood, HouseStyle, and building materials; multivariate impute might be in order, if keeping the feature in the first place.

Duplexes and two-family are significantly cheaper than single-family and townhouses, if accepting the low number in the sample. Candidate for binarization, but may lose interactions.

3.18 HouseStyle

Back to top.

x = 'HouseStyle'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "2Story" "1Story" "SLvl"   "1.5Fin"

Mostly one-story (358), but many two-story (220). Several significant price differences across groups. Maybe worth keeping, but also kind of noisy with the finished/unfinished business only applying to half-stories, and number of stories and finished status are encoded in other features.

3.19 OverallQual

Back to top.

x = 'OverallQual'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "7" "5" "8" "4" "6"

There’s a very strong relationship to SalePrice.

3.19.1 Normalize

It’s a pretty normal distribution, slightly left-skewed. Mode (2199 5s) left of median/mean (180 6s), few 1s, 2s, and 3s. It might be worth casting as an integer and transforming to normalize and possibly improve the correlation.

x = 'OverallQual_int'

val_train_Xy = val_train_Xy %>%
  mutate(OverallQual_int = as.integer(OverallQual))

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  OverallQual_int 
##  Min.   : 1.000  
##  1st Qu.: 5.000  
##  Median : 6.000  
##  Mean   : 6.092  
##  3rd Qu.: 7.000  
##  Max.   :10.000
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 1,
  t_binw = 1
)

## NULL

None of the transformations improved its distribution.

3.19.2 Correlations

Here are the correlations between OverallQual_int and the rest of the variables.

This feature has several moderate correlations to features having to do with size and age. It also has a strong correlation to transformed SalePrice (0.82).

num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c(x)

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##  OverallQual_int  
##  Min.   :0.01751  
##  1st Qu.:0.13440  
##  Median :0.27000  
##  Mean   :0.31424  
##  3rd Qu.:0.54465  
##  Max.   :0.82099
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

y_lst = c('log(SalePrice)', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF',
          'GrLivArea','FullBath', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageCars',
          'GarageArea')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)

## NULL

3.19.3 By Factors

Again, there’s a lot of interaction with MSSubClass and Neighborhood. Quality also decreases with zoning density. Two-story houses are generally rated higher than one-story houses. Vinyl gets rated highest among exteriors. Simply having masonry improves quality rating. Poured concrete foundations on average receive 7s, whereas cinder block and brick/tile receive 5s on average. The same is true for attached and built-in garages compared to detached and no garages. It almost looks like it’s better to have no fence than to have one with minor privacy. Court officer deeds/estates are rated more poorly than warranty deeds and new sales. Abnormal sales are rated lower than normal sales.

Overall quality is doing a lot of the work for other features toward predicting price.

y_lst = c('MSSubClass', 'MSZoning', 'Neighborhood', 'BldgType', 'HouseStyle',
          'RoofMatl', 'RoofStyle', 'Exterior1st', 'MasVnrType', 'Foundation',
          'Heating', 'Electrical', 'Functional', 'GarageType', 'GarageFinish',
          'Fence', 'MiscFeature', 'SaleType', 'SaleCondition')
for (y in y_lst) {
  plt = fenced_jbv(data = val_train_Xy, x = y, y = x) +
    theme(axis.text.x = element_text(angle = 45, hjust=1))
  print(plt)
}

3.20 OverallCond

Back to top.

x = 'OverallCond'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "5" "6" "8" "4" "7"

OverallCond is like OverallQual, but with a much more pronounced mode (397 5s) left of median/mean (124 6s), probably due to wear and tear being more universal than quality construction.

The correlation to SalePrice is weaker, and 5s oddly seem to sell for more on average than higher-rated houses.

3.20.1 Normalize

x = 'OverallCond_int'

val_train_Xy = val_train_Xy %>%
  mutate(OverallCond_int = as.integer(OverallCond))

# Recalculate best normalizers. Might as well do them all, see if previous
# transformations benefit from further transformation, while we're at it.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  OverallCond_int
##  Min.   :1.00   
##  1st Qu.:5.00   
##  Median :5.00   
##  Mean   :5.55   
##  3rd Qu.:6.00   
##  Max.   :9.00
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 1,
  t_binw = 1
)

## NULL

3.20.2 Correlations

This feature is only weakly correlated to a couple of age features. It has no linear correlation to SalePrice.

num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c(x)

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##  OverallCond_int    
##  Min.   :0.0008734  
##  1st Qu.:0.0201988  
##  Median :0.0386786  
##  Mean   :0.0676805  
##  3rd Qu.:0.1009599  
##  Max.   :0.3591485
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

y_lst = c('log(SalePrice)', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)

## NULL

The bump in price among houses of average condition seems to have to do with a cluster of houses made in the late ’90s and early 2000s.

ggplot(
  data = val_train_Xy,
  mapping = aes(
    x = OverallCond_int,
    y = .data[['log(SalePrice)']],
    color = YearBuilt
  )
) +
  geom_jitter(alpha = 0.5) +
  geom_smooth() #+

  # facet_wrap(vars(BldgType))

3.20.3 By Factors

OldTown seems to be in relatively good shape, while Edwards has proportionately more houses in need of work and reconditioning. You can easily spot clustered 5s in the neighborhoods that likely grew up in the building boom of the 2000s.

y_lst = c('MSSubClass', 'MSZoning', 'Neighborhood', 'Condition1', 'BldgType',
          'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'MasVnrType',
          'BsmtExposure', 'Foundation', 'Heating', 'Electrical', 'Functional',
          'GarageType', 'GarageFinish', 'Fence', 'MiscFeature', 'SaleType',
          'SaleCondition')
for (y in y_lst) {
  plt = fenced_jbv(data = val_train_Xy, x = y, y = x) +
    theme(axis.text.x = element_text(angle = 45, hjust=1))
  print(plt)
}

Houses with metal and wood exteriors are typically in better condition than those with vinyl despite houses with vinyl siding typically being rated as higher quality and newer. Likewise, houses with cinder block foundations seem to fare better over time than those with poured concrete despite quality ratings, according to this data set, as do houses with detached garages compared to those with attached and built-in garages. Is this because older houses and lower-quality houses that are still standing are the ones that have received better maintenance?

p1 = ggplot(
  data = val_train_Xy,
  mapping = aes(
    y = OverallCond_int,
    x = Exterior1st,
    color = YearBuilt
  )
) +
  geom_jitter() +
  geom_boxplot(alpha = 0) +
  theme(axis.text.x = element_text(angle = 45, hjust=1))

p2 = ggplot(
  data = val_train_Xy,
  mapping = aes(
    y = OverallCond_int,
    x = Foundation,
    color = YearBuilt
  )
) +
  geom_jitter() +
  geom_boxplot(alpha = 0) +
  theme(axis.text.x = element_text(angle = 45, hjust=1))

p3 = ggplot(
  data = val_train_Xy,
  mapping = aes(
    y = OverallCond_int,
    x = GarageType,
    color = YearBuilt
  )
) +
  geom_jitter() +
  geom_boxplot(alpha = 0) +
  theme(axis.text.x = element_text(angle = 45, hjust=1))

grid.arrange(p1, p2, p3, ncol = 2)

3.21 YearBuilt

Back to top.

No transformations normalized the distribution. You can see the construction boom between WWII and the ’80s (with a dip in mid ’70s stagflation), followed by the relative explosion starting in the late ’90s and dropping with the housing crisis in the 2000s.

Consider grouping around modes into a factor and dummy coding to accommodate some ML algorithms that wouldn’t handle a polymodal distribution well.

x = 'YearBuilt'
summary(val_train_Xy[x])
##    YearBuilt   
##  Min.   :1872  
##  1st Qu.:1954  
##  Median :1972  
##  Mean   :1971  
##  3rd Qu.:2000  
##  Max.   :2010
# sum_and_trans_cont(
#   data = val_train_Xy,
#   x = x,
#   func = best_normalizers[[x]]$best_func$func,
#   func_name = best_normalizers[[x]]$best_func$name,
#   x_binw = 1,
#   t_binw = 1
# )

ggplot(val_train_Xy, aes(x = YearBuilt)) +
  geom_histogram(binwidth = 1)

3.21.1 Correlations

x = 'YearBuilt'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c(x)

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##    YearBuilt        
##  Min.   :0.0009914  
##  1st Qu.:0.0534053  
##  Median :0.1616352  
##  Mean   :0.2334731  
##  3rd Qu.:0.3684433  
##  Max.   :0.8301510
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

y_lst = colnames(select(val_train_Xy, where(is.numeric)))
for (y in y_lst) {
  for (x in x_lst) {
    plt = ggplot(
      select(val_train_Xy, all_of(c(x, y))),
      aes(x = .data[[x]], y = .data[[y]])
    ) +
      geom_jitter() +
      geom_smooth() +
      labs(x = x, y = y) +
      scale_x_continuous(breaks = seq(1880, 2010, 5)) +
      theme(axis.text.x = element_text(angle = 90, hjust = 1))
    print(plt)
  }
}

3.21.2 By Factors

y_lst = colnames(select(val_train_Xy, where(is.factor)))
for (y in y_lst) {
  plt = fenced_jbv(data = val_train_Xy, x = y, y = x) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(plt)
}

3.21.3 A Story of Ames

Before looking at remodel year, this is the story I gather about houses purchased in this period based on the above visualizations of YearBuilt. I could try to weave the viz in with the narrative, but I’m not going to.

Houses and garages have gotten bigger over the years, with more bathrooms and less enclosed porch space. (I’m curious to see whether the increasing rarity of enclosed porch space led to it being a prized (priced) feature, or if it simply isn’t as valued now.) Newer homes tend to be valued more, and builders generally began to produce higher-quality houses.

Most of the houses built in the post-WWII boom were single-story, but that’s also when 2 1/2 stories became unheard-of. Split/multi-level and duplexes began to come on the scene. 1 1/2 stories had their heyday. Some of the houses from then and earlier make up the group of two-family conversions. It would be interesting to check remodel years to see when those conversions tended to take place.

As the century turned, prices grew exponentially. Two-story houses and PUD housing became more prevalent, and townhouses appeared for the first time. Zoning became less dense as new suburbs sprang up. Lots became more irregular. Cul de sacs and parks started to sprinkle in as feeder streets networked out. A handful of houses began to appear near railroads.

Even as masonry became a growing bling factor driving up overall quality, the age of plastic ushered in vinyl siding as the dominant exterior. The overall condition of houses from the turn of the century on is starkly average compared to older houses which are commonly in good shape. Checking against remodel years will likely explain some of this, but I’m curious to compare exterior conditions of different materials with regard to age.

The Lost Generation got brick and tile foundations. Boomers got cinder block and slabs. Gen x and Millenials got poured concrete. This enabled us to build into the sides of hills better and have more exposed basements with walkouts.

Most of the basementless houses were built during the post-WWII boom when ramblers were a common answer to the burgeoning dream of homeownership. Basement area continuously grew from then on, especially finished basement area as people began to live more underground. Kitchens and bedrooms began to be more common below ground.

Kitchens became more of a target for adding value with improved materials, construction, and appliances. Heating systems improved over the years as well. New electrical standards came into place in 1960.

When the ’80s hit, the automobiled society was in full swing, and it was rare to see less than a two-car garage built anymore, let alone a house without a garage. Rather than a detached secondary building, garages became part of the main house itself. It was more often a finished space for more than just housing and working on cars, but without the need for high-quality construction. Unpaved driveways were now out of the question, though.

After 2000, virtually no new houses had fences, at least none that were bought during the years this data set covers.

3.21.4 YearBuilt as Factor

I’ll condense the housing booms around their modes in a factor that I can one-hot encode to accommodate some ML algorithms that wouldn’t handle a polymodal distribution well. To maintain the model going forward, new periods will have to be identified and added.

# Three periods: < 1945 < 1985ish < 2010
val_train_Xy = val_train_Xy %>%
  mutate(
    YearBuilt.fact = cut(
      x = YearBuilt,
      breaks = c(-Inf, 1944, 1986, Inf),
      ordered_result = T,
      labels = c('Before_1945', '1945_1986', 'After_1986')
    )
  )

p1 = ggplot(val_train_Xy, aes(x = YearBuilt)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = seq(1880, 2010, 5)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

p2 = ggplot(val_train_Xy, aes(x = YearBuilt.fact)) +
  geom_bar()

grid.arrange(p1, p2)

x = 'YearBuilt.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "After_1986"  "Before_1945" "1945_1986"

3.22 Age

Back to top.

It might make sense to transform year built into the age of the house when bought. The years of sales is a small range, but it may add a touch more predictive information, especially for the newer houses.

3.22.1 Normalize

It seems to have added a couple of outliers, but applying a square-root transformation better centers it and removes the outliers.

0s don’t indicate a missing feature, so I want to include them in the search for the best normalizer.

val_train_Xy = val_train_Xy %>%
  mutate('Age' = (YrSold - YearBuilt))

x = 'Age'

# Include 0s. Just recalculate rather than rewrite code.
num_feats = c(x)
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(-1)
)

summary(val_train_Xy[x])
##       Age        
##  Min.   :  0.00  
##  1st Qu.:  8.00  
##  Median : 35.00  
##  Mean   : 36.83  
##  3rd Qu.: 55.00  
##  Max.   :136.00
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 1,
  t_binw = 1/10
)

## NULL
x = 'sqrt(Age)'
val_train_Xy = val_train_Xy %>%
  mutate('sqrt(Age)' = sqrt(Age))

# Include 0s. Just recalculate rather than rewrite code.
num_feats = c(x)
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(-1)
)

summary(val_train_Xy[x])
##    sqrt(Age)     
##  Min.   : 0.000  
##  1st Qu.: 2.828  
##  Median : 5.916  
##  Mean   : 5.341  
##  3rd Qu.: 7.416  
##  Max.   :11.662
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 1/10,
  t_binw = 1/10
)

## NULL
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

3.22.2 Correlations

num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('YearBuilt', 'Age', 'sqrt(Age)')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##    YearBuilt              Age            sqrt(Age)       
##  Min.   :0.0009914   Min.   :0.00325   Min.   :0.000536  
##  1st Qu.:0.0534053   1st Qu.:0.06282   1st Qu.:0.081377  
##  Median :0.1616352   Median :0.15871   Median :0.165492  
##  Mean   :0.2334731   Mean   :0.23488   Mean   :0.248951  
##  3rd Qu.:0.3684433   3rd Qu.:0.36838   3rd Qu.:0.359191  
##  Max.   :0.8301510   Max.   :0.82972   Max.   :0.847412
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

y_lst = c('log(SalePrice)', 'Win(log(SalePrice))', 'GarageArea', 'GarageCars',
          'EnclosedPorch', 'OpenPorchSF', 'OverallQual_int', 'OverallCond_int')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)

## NULL

There’s a small increase in linear correlation of Age to transformed SalePrice from YearBuilt (0.59 to 0.62). Correlations to some measures of size strengthened. The correlation to quality rose quite a bit (from 0.59 to 0.65) while the correlation to condition only rose slightly (0.36 to 0.37), remaining weak.

With YearBuilt, there’s still a polynomial appearance to the plot against log(SalePrice), but that’s smoothed out with sqrt(Age). YearRemodAdd may add clarity.

The concentration of average-condition houses among the youngest is still puzzling. It’s as if a few years of aging tends to improve the condition of a house. Perhaps owners tend to add cosmetic value in the first years. Maybe assessors use the youngest houses as the anchor by which to assess all others. Many of these houses were unfinished, which explains some but not all of this.

ggplot(
  val_train_Xy,
  aes(
    x = .data[['sqrt(Age)']],
    y = OverallCond_int,
    shape = SaleCondition,
    color = SaleCondition
  )
) +
  geom_jitter()

val_train_Xy = select(val_train_Xy, -c('Age'))

3.23 YearRemodAdd

Back to top.

x = 'YearRemodAdd'
summary(val_train_Xy[x])
##   YearRemodAdd 
##  Min.   :1950  
##  1st Qu.:1966  
##  Median :1994  
##  Mean   :1985  
##  3rd Qu.:2004  
##  Max.   :2010
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 1,
  t_binw = 1
)

## NULL

There was a curiously large number of remodels in 1950, about 80 remodels that year compared to the peak of about 50 in the 2000s. These were all built before 1950, and no house has an earlier remodel year than 1950, and some houses built before 1950 have remodel years later than 1950.

I’ll treat houses with a 1950 remodel as if they have not had a remodel; they may have had an earlier remodel, but I’m guessing that those older remodels are of little added value at this point. Ames assessors may have had reason to bottom-code, but I’ll set remodel year to built year in years prior to 1950 for now and see if it helps or hinders analysis and prediction.

val_train_Xy = val_train_Xy %>%
  mutate(YearRemodAdd.uncode = ifelse(YearRemodAdd == 1950, YearBuilt, YearRemodAdd))
x = 'YearRemodAdd.uncode'
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  YearRemodAdd.uncode
##  Min.   :1880       
##  1st Qu.:1966       
##  Median :1994       
##  Mean   :1982       
##  3rd Qu.:2004       
##  Max.   :2010
# sum_and_trans_cont(
#   data = val_train_Xy,
#   x = x,
#   func = best_normalizers[[x]]$best_func$func,
#   func_name = best_normalizers[[x]]$best_func$name,
#   x_binw = 1,
#   t_binw = 1
# )
ggplot(val_train_Xy, aes(x = .data[[x]])) +
  geom_histogram(binwidth = 1)

3.23.1 Correlations

This only added outliers and slightly weakened the correlation to SalePrice.

num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('YearBuilt', 'YearRemodAdd', 'YearRemodAdd.uncode')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##    YearBuilt          YearRemodAdd      YearRemodAdd.uncode
##  Min.   :0.0009914   Min.   :0.004039   Min.   :0.001117   
##  1st Qu.:0.0534053   1st Qu.:0.055532   1st Qu.:0.063792   
##  Median :0.1616352   Median :0.152282   Median :0.145630   
##  Mean   :0.2421409   Mean   :0.203658   Mean   :0.198553   
##  3rd Qu.:0.3684433   3rd Qu.:0.294762   3rd Qu.:0.268835   
##  Max.   :0.9624094   Max.   :0.654782   Max.   :0.657064
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

y_lst = c('log(SalePrice)')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)

## NULL

3.23.2 By Factors

y_lst = colnames(select(val_train_Xy, where(is.factor)))
for (y in y_lst) {
  plt = fenced_jbv(data = val_train_Xy, x = y, y = x) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(plt)
}

val_train_Xy = select(val_train_Xy, -c('YearRemodAdd.uncode'))

Removing the records in which there is no remodel (i.e. YearRemodAdd == YearBuilt or 1950) adds clarity. 368 rows were removed for no remodel, and remodels really started being recorded in increasing numbers starting in the ’90s, maybe a little in the late ’80s.

ggplot(
  val_train_Xy,
  aes(x = ifelse(YearRemodAdd == YearBuilt, NA, YearRemodAdd)
  )
) +
  geom_histogram(binwidth = 1)

3.23.3 YearRemodAdd as Factor

Because there are so many houses without remodels, I’ll split it into a factor, a level for each decade with the year 1950 conveniently lumped in with NAs.

x = 'YearRemodAdd.fact'
val_train_Xy = val_train_Xy %>%
  mutate(
    YearRemodAdd.fact = factor(
      cut(
        ifelse(
          YearRemodAdd == YearBuilt | is.na(YearRemodAdd),
          1949,
          YearRemodAdd
        ),
        breaks = c(1949, 1950, 1960, 1970, 1980, 1990, 2000, 2010),
        labels = c('None', '50s', '60s', '70s', '80s', '90s', '00s')
      )
    )
  ) #%>%
  # select(-c('YearRemodAdd'))

y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "None" "00s"  "90s"

3.24 RoofStyle

Back to top.

Most are Gable (563), and many are Hip (139). Flat, Gambrel, and Mansard are neglible.

x = 'RoofStyle'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Gable" "Hip"

3.25 RoofMatl

Back to top.

Vast majority are composition shingle (702). Can drop this feature.

summary(val_train_Xy$RoofMatl)
## ClyTile CompShg Membran   Metal    Roll Tar&Grv WdShake WdShngl 
##       1     705       1       0       0       4       1       3
val_train_Xy = select(val_train_Xy, -c('RoofMatl'))

3.26 Exterior1st/2nd

Back to top.

Most popular class is vinyl (255 and 249), but wood, metal, and others represent significant classes. No ‘None’ in 2nd, so all houses in Ames have at least two types of siding?

x = 'Exterior1st'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "VinylSd" "MetalSd" "Wd Sdng" "HdBoard" "Plywood"
x = 'Exterior2nd'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "VinylSd" "MetalSd" "Wd Sdng" "HdBoard" "Plywood"

3.27 MasVnrType

Back to top.

Most have none (430), but plenty of brick (219) and stone (66). No cinderblock in the split.

x = 'MasVnrType'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "BrkFace" "None"    "Stone"

3.28 MasVnrArea

Back to top.

3.28.1 Normalize

x = 'MasVnrArea'
summary(val_train_Xy[x])
##    MasVnrArea    
##  Min.   :   0.0  
##  1st Qu.:   0.0  
##  Median :   0.0  
##  Mean   : 103.3  
##  3rd Qu.: 166.5  
##  Max.   :1129.0
sum_and_trans_cont(
  data = val_train_Xy[val_train_Xy$MasVnrArea != 0, ],
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 10,
  t_binw = 0.25
)

## NULL
x = 'cbrt(MasVnrArea)'
val_train_Xy = val_train_Xy %>%
  mutate('cbrt(MasVnrArea)' = MasVnrArea^(1/3))

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  cbrt(MasVnrArea)
##  Min.   : 0.000  
##  1st Qu.: 0.000  
##  Median : 0.000  
##  Mean   : 2.391  
##  3rd Qu.: 5.501  
##  Max.   :10.413
sum_and_trans_cont(
  data = val_train_Xy[val_train_Xy$MasVnrArea != 0, ],
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 0.25,
  t_binw = 0.25
)

## NULL

3.28.2 Winsorize

Because this variable’s 0s indicate a missing feature, we’re really concerned with Winsorizing non-zero values.

df = val_train_Xy[val_train_Xy$MasVnrArea != 0, ]

qqnorm(y = df$MasVnrArea, ylab = 'MasVnrArea')
qqline(y = df$MasVnrArea, ylab = 'MasVnrArea')

qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)

qqnorm(y = sqrt(sqrt(df$MasVnrArea)), ylab = 'sqrt(sqrt(MasVnrArea))')
qqline(y = sqrt(sqrt(df$MasVnrArea)), ylab = 'sqrt(sqrt(MasVnrArea))')

Win_sqrt_x = Winsorize(
  x = sqrt(sqrt(df$MasVnrArea)),
  probs = c(0.005, 0.995),
  na.rm = T
)

qqnorm(y = Win_sqrt_x, ylab = 'Win(sqrt(sqrt(MasVnrArea)))')
qqline(y = Win_sqrt_x, ylab = 'Win(sqrt(sqrt(MasVnrArea)))')

Win_cbrt_x = Winsorize(
  x = df$`cbrt(MasVnrArea)`,
  probs = c(0.005, 0.995),
  na.rm = T
)

qqnorm(y = Win_cbrt_x, ylab = 'Win(cbrt(MasVnrArea))')
qqline(y = Win_cbrt_x, ylab = 'Win(cbrt(MasVnrArea))')

Win_raw_x = Winsorize(
  x = df$MasVnrArea,
  probs = c(0.05, 0.95),
  na.rm = T
)

qqnorm(y = Win_raw_x, ylab = 'Win(MasVnrArea)')
qqline(y = Win_raw_x, ylab = 'Win(MasVnrArea)')

print(shapiro.test(df$MasVnrArea))
## 
##  Shapiro-Wilk normality test
## 
## data:  df$MasVnrArea
## W = 0.86407, p-value = 3.522e-15
print(shapiro.test(df$'cbrt(MasVnrArea)'))
## 
##  Shapiro-Wilk normality test
## 
## data:  df$"cbrt(MasVnrArea)"
## W = 0.99498, p-value = 0.4769
print(shapiro.test(sqrt(sqrt(df$MasVnrArea))))
## 
##  Shapiro-Wilk normality test
## 
## data:  sqrt(sqrt(df$MasVnrArea))
## W = 0.99153, p-value = 0.09952
print(shapiro.test(Win_sqrt_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_sqrt_x
## W = 0.99525, p-value = 0.5281
print(shapiro.test(Win_cbrt_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_cbrt_x
## W = 0.99576, p-value = 0.6327
print(shapiro.test(Win_raw_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_raw_x
## W = 0.90384, p-value = 1.551e-12
min_val = min(Win_cbrt_x)
max_val = max(Win_cbrt_x)
val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(cbrt(MasVnrArea))' = ifelse(
      MasVnrArea == 0,
      0,
      Winsorize(
        x = `cbrt(MasVnrArea)`,
        # probs = c(0.005, 0.995),
        minval = min_val,
        maxval = max_val
      )
    )
  )

3.28.3 Correlations

x = 'Win(cbrt(MasVnrArea))'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('MasVnrArea', 'cbrt(MasVnrArea)', x)

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    .data[[x]] != 0
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
##    MasVnrArea      cbrt(MasVnrArea)  Win(cbrt(MasVnrArea))
##  Min.   :0.01480   Min.   :0.00778   Min.   :0.005013     
##  1st Qu.:0.06893   1st Qu.:0.07272   1st Qu.:0.071175     
##  Median :0.10170   Median :0.13331   Median :0.134064     
##  Mean   :0.14780   Mean   :0.15174   Mean   :0.151855     
##  3rd Qu.:0.23594   3rd Qu.:0.23006   3rd Qu.:0.230854     
##  Max.   :0.38455   Max.   :0.36920   Max.   :0.372665     
##  NA's   :1         NA's   :1         NA's   :1
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##    MasVnrArea        cbrt(MasVnrArea)   Win(cbrt(MasVnrArea))
##  Min.   :0.0009883   Min.   :0.007318   Min.   :0.007091     
##  1st Qu.:0.0727252   1st Qu.:0.068257   1st Qu.:0.068430     
##  Median :0.1386111   Median :0.152985   Median :0.153200     
##  Mean   :0.1806451   Mean   :0.187874   Mean   :0.187854     
##  3rd Qu.:0.3070987   3rd Qu.:0.314667   3rd Qu.:0.314633     
##  Max.   :0.4212846   Max.   :0.430102   Max.   :0.430096
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

y_lst = c('log(SalePrice)')
for (feat in x_lst) {
 plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst) 
}

This variable has a lot of zeros that indicate a missing feature that make it a candidate for binary encoding to aid linear regression. But, MasVnrType already encodes that in ‘None’. Because there’s a significant difference in price between MasVnrType levels, that should suffice. Also, leaving the 0s in improves the correlation to the target variable, so it may be moot.

3.28.4 Hard Code

# already hardcoded this one.

print(paste("min_val:", min_val))
## [1] "min_val: 1.52019153849196"
print(paste("max_val:", max_val))
## [1] "max_val: 10.278035603362"
ggplot(val_train_Xy, aes(x = .data[[x]])) +
  geom_histogram(binwidth = .2)

3.28.5 By Factors

y_lst = c('MasVnrType')
z = 'SalePrice.fact'
for (y in y_lst) {
  plt = fenced_jbv(
    data = val_train_Xy,
    x = y,
    y = 'cbrt(MasVnrArea)',
    jit_col = z,
    jit_alpha = 0.75,
    leg_lb = z,
    box_color = 'red'
  ) +
    theme(axis.text.x = element_text(angle = 45, hjust=1))
  print(plt)
}

ggplot(val_train_Xy, aes(x = `cbrt(MasVnrArea)`, y = `log(SalePrice)`)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = 'lm') +
  facet_wrap(vars(MasVnrType))

There are some houses with masonry footage but no type ascribed. I noticed this during the wrangling audit and decided to leave it for the modeling phase, maybe to impute the masonry type with caret using a select few variables as indicators or simply imputing the most common type (BrkFace), but maybe create a new level for those handful of undescribed masonry types.

3.29 ExterQual

Back to top.

Mostly average (440), many good (241).

x = 'ExterQual'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Gd" "TA"

3.30 ExterCond

Back to top.

Greater majority average (621), still some good (74). May be worth keeping.

x = 'ExterCond'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## character(0)

3.31 Foundation

Back to top.

Evenly split between poured concrete and cinder block (317 and 302), but still 72 brick and tile.

x = 'Foundation'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "PConc"  "BrkTil" "CBlock"

Could drop Stone and Wood and make it an ordered factor to represent as ints in regression or to one-hot.

3.32 BsmtQual

Back to top.

Evenly split between average and good (313 and 303), but 63 excellent, and only 26 without basements. I’m having trouble imagining houses with basements and cinder block foundations.

x = 'BsmtQual'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Gd" "TA" "Ex"

3.33 BsmtCond

Back to top.

Vast majority average (635).

x = 'BsmtCond'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## character(0)

3.34 BsmtExposure

Back to top.

Great majority (about 464) not exposed, but still some average (100), good (71), and minimal exposure (54).

x = 'BsmtExposure'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "No" "Av" "Mn" "Gd"

3.35 BsmtFinType1

Back to top.

211 are unfinished, 205 are top quality, and descending counts to low quality (34).

x = 'BsmtFinType1'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "GLQ" "Unf" "BLQ" "ALQ" "LwQ" "Rec"

Probably just one-hot GLQ, LwQ, and None for regression.

3.36 BsmtFinSF1

Back to top.

3.36.1 Normalize

x = 'BsmtFinSF1'
summary(val_train_Xy[x])
##    BsmtFinSF1    
##  Min.   :   0.0  
##  1st Qu.:   0.0  
##  Median : 375.0  
##  Mean   : 446.5  
##  3rd Qu.: 699.0  
##  Max.   :5644.0
sum_and_trans_cont(
  data = val_train_Xy[val_train_Xy$BsmtFinSF1 != 0, ],
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 50,
  t_binw = 0.25
)

## NULL
x = 'cbrt(BsmtFinSF1)'
val_train_Xy = val_train_Xy %>%
  mutate('cbrt(BsmtFinSF1)' = BsmtFinSF1^(1/3))

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  cbrt(BsmtFinSF1)
##  Min.   : 0.000  
##  1st Qu.: 0.000  
##  Median : 7.211  
##  Mean   : 5.564  
##  3rd Qu.: 8.875  
##  Max.   :17.804
sum_and_trans_cont(
  data = val_train_Xy[val_train_Xy$BsmtFinSF1 != 0, ],
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 0.25,
  t_binw = 0.25
)

## NULL
ggplot(val_train_Xy, aes(x = .data[['cbrt(BsmtFinSF1)']])) +
  geom_histogram()

3.36.2 Correlations

num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('BsmtFinSF1', 'cbrt(BsmtFinSF1)')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##    BsmtFinSF1       cbrt(BsmtFinSF1)  
##  Min.   :0.006172   Min.   :0.000255  
##  1st Qu.:0.079152   1st Qu.:0.069259  
##  Median :0.218996   Median :0.115415  
##  Mean   :0.220147   Mean   :0.157082  
##  3rd Qu.:0.318490   3rd Qu.:0.196450  
##  Max.   :0.644301   Max.   :0.643687
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

y_lst = c('log(SalePrice)')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)

## NULL
plot_scat_pairs(df = val_train_Xy, x = 'BsmtFinSF1', y_lst = y_lst)

## NULL

This will be dropped in favor of a derivative variable, total basement sf, so no need to go any further.

val_train_Xy = select(val_train_Xy, -c('cbrt(BsmtFinSF1)'))

3.37 BsmtFinType2

Back to top.

Mostly unfinished (609) but 27 no basements. So, only one house in Ames with a basement doesn’t have a second basement?? This doesn’t seem right. It might be worth dropping this feature.

x = 'BsmtFinType2'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## character(0)

3.38 BsmtFinSF2

Back to top.

No need to look further as this will be dropped in favor of total bsmt sf.

3.39 BsmtUnfSF

Back to top.

3.39.1 Normalize

x = 'BsmtUnfSF'
summary(val_train_Xy[x])
##    BsmtUnfSF     
##  Min.   :   0.0  
##  1st Qu.: 234.0  
##  Median : 470.0  
##  Mean   : 564.6  
##  3rd Qu.: 795.5  
##  Max.   :2046.0
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 50,
  t_binw = 0.25
)

## NULL
x = 'cbrt(BsmtUnfSF)'
val_train_Xy = val_train_Xy %>%
  mutate('cbrt(BsmtUnfSF)' = BsmtUnfSF^(1/3))

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  cbrt(BsmtUnfSF) 
##  Min.   : 0.000  
##  1st Qu.: 6.162  
##  Median : 7.775  
##  Mean   : 7.402  
##  3rd Qu.: 9.266  
##  Max.   :12.695
sum_and_trans_cont(
  data = filter(val_train_Xy, BsmtUnfSF != 0),
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 0.25,
  t_binw = 0.25
)

## NULL

3.39.2 Winsorize

Because this variable’s 0s indicate a missing basement, just Winsorizing non-zero values.

df = val_train_Xy[val_train_Xy$BsmtUnfSF != 0, ]

qqnorm(y = df$BsmtUnfSF, ylab = 'BsmtUnfSF')
qqline(y = df$BsmtUnfSF, ylab = 'BsmtUnfSF')

qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)

Win_cbrt_x = Winsorize(
  x = df[[x]],
  probs = c(0.05, 0.95),
  na.rm = T
)

qqnorm(y = Win_cbrt_x, ylab = 'Win(cbrt(BsmtUnfSF)')
qqline(y = Win_cbrt_x, ylab = 'Win(cbrt(BsmtUnfSF)')

Win_raw_x = Winsorize(
  x = df$BsmtUnfSF,
  probs = c(0, 0.95),
  na.rm = T
)

qqnorm(y = Win_raw_x, ylab = 'BsmtUnfSF')
qqline(y = Win_raw_x, ylab = 'BsmtUnfSF')

print(shapiro.test(x = df$BsmtUnfSF))
## 
##  Shapiro-Wilk normality test
## 
## data:  df$BsmtUnfSF
## W = 0.92795, p-value < 2.2e-16
print(shapiro.test(x = df$`cbrt(BsmtUnfSF)`))
## 
##  Shapiro-Wilk normality test
## 
## data:  df$`cbrt(BsmtUnfSF)`
## W = 0.993, p-value = 0.003542
print(shapiro.test(x = Win_cbrt_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_cbrt_x
## W = 0.96978, p-value = 2.054e-10
print(shapiro.test(x = Win_raw_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_raw_x
## W = 0.93316, p-value < 2.2e-16

3.39.3 Correlations

x = 'cbrt(BsmtUnfSF)'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('BsmtUnfSF', 'cbrt(BsmtUnfSF)')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##    BsmtUnfSF        cbrt(BsmtUnfSF)   
##  Min.   :0.007563   Min.   :0.007861  
##  1st Qu.:0.042330   1st Qu.:0.039367  
##  Median :0.137369   Median :0.128333  
##  Mean   :0.135120   Mean   :0.122909  
##  3rd Qu.:0.180533   3rd Qu.:0.165133  
##  Max.   :0.473352   Max.   :0.356293
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    .data[[x]] != 0
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
##    BsmtUnfSF        cbrt(BsmtUnfSF)   
##  Min.   :0.008363   Min.   :0.001348  
##  1st Qu.:0.055197   1st Qu.:0.045389  
##  Median :0.101256   Median :0.078388  
##  Mean   :0.127480   Mean   :0.108878  
##  3rd Qu.:0.156947   3rd Qu.:0.141344  
##  Max.   :0.534817   Max.   :0.539121
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')

y_lst = c('log(SalePrice)')
for (feat in x_lst) {
  plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}

3.40 TotalBsmtSF

Back to top.

3.40.1 Normalize

x = 'TotalBsmtSF'
summary(val_train_Xy[x])
##   TotalBsmtSF    
##  Min.   :   0.0  
##  1st Qu.: 793.5  
##  Median : 981.0  
##  Mean   :1058.1  
##  3rd Qu.:1291.0  
##  Max.   :6110.0
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 50,
  t_binw = 1/20
)

## NULL
x = 'log(TotalBsmtSF)'
val_train_Xy = val_train_Xy %>%
  mutate(
    'log(TotalBsmtSF)' = ifelse(
    TotalBsmtSF <= 0,
    0,
    log(TotalBsmtSF)
    )
  )

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  log(TotalBsmtSF)
##  Min.   :0.000   
##  1st Qu.:6.676   
##  Median :6.889   
##  Mean   :6.734   
##  3rd Qu.:7.163   
##  Max.   :8.718
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 1/20,
  t_binw = 1
)

## NULL
x = 'square(log(TotalBsmtSF))'
val_train_Xy = val_train_Xy %>%
  mutate(
    'square(log(TotalBsmtSF))' = ifelse(
    TotalBsmtSF <= 0,
    0,
    log(TotalBsmtSF)^2
    )
  )

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  square(log(TotalBsmtSF))
##  Min.   : 0.00           
##  1st Qu.:44.58           
##  Median :47.45           
##  Mean   :46.77           
##  3rd Qu.:51.31           
##  Max.   :76.00
sum_and_trans_cont(
  data = val_train_Xy,
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 1,
  t_binw = 1
)

## NULL

3.40.2 Winsorize

Just checking non-zero set.

df = val_train_Xy[val_train_Xy$TotalBsmtSF != 0, ]

qqnorm(y = df$TotalBsmtSF, ylab = 'TotalBsmtSF')
qqline(y = df$TotalBsmtSF, ylab = 'TotalBsmtSF')

qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)

Win_log_x_squared = Winsorize(
  x = df[[x]],
  probs = c(0.005, 0.995),
  na.rm = T
)

qqnorm(y = Win_log_x_squared, ylab = 'Win_log_x_squared')
qqline(y = Win_log_x_squared, ylab = 'Win_log_x_squared')

Win_raw_x = Winsorize(
  x = df$TotalBsmtSF,
  probs = c(0, 0.99),
  na.rm = T
)

qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')

print(shapiro.test(x = df$TotalBsmtSF))
## 
##  Shapiro-Wilk normality test
## 
## data:  df$TotalBsmtSF
## W = 0.84661, p-value < 2.2e-16
print(shapiro.test(x= df$`square(log(TotalBsmtSF))`))
## 
##  Shapiro-Wilk normality test
## 
## data:  df$`square(log(TotalBsmtSF))`
## W = 0.98181, p-value = 1.344e-07
print(shapiro.test(x = Win_log_x_squared))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_log_x_squared
## W = 0.99018, p-value = 0.0001361
print(shapiro.test(x = Win_raw_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_raw_x
## W = 0.9591, p-value = 5.387e-13
x = 'Win(square(log(TotalBsmtSF)))'

val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(square(log(TotalBsmtSF)))' = ifelse(
      TotalBsmtSF <= 0,
      0,
      Winsorize(
        x = log(TotalBsmtSF)^2,
        probs = c(0.005, 0.995),
        na.rm = T
      )
    )
  ) %>%
  mutate(
    'Win(log(TotalBsmtSF))' = ifelse(
      TotalBsmtSF <= 0,
      0,
      Winsorize(
        x = log(TotalBsmtSF),
        probs = c(0.005, 0.995),
        na.rm = T
      )
    )
  )

3.40.3 Correlations

num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('TotalBsmtSF', 'log(TotalBsmtSF)', 'square(log(TotalBsmtSF))',
          'Win(square(log(TotalBsmtSF)))')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##   TotalBsmtSF       log(TotalBsmtSF)   square(log(TotalBsmtSF))
##  Min.   :0.001736   Min.   :0.001862   Min.   :0.004075        
##  1st Qu.:0.111764   1st Qu.:0.067788   1st Qu.:0.076938        
##  Median :0.352060   Median :0.155643   Median :0.226709        
##  Mean   :0.306997   Mean   :0.178743   Mean   :0.233972        
##  3rd Qu.:0.404087   3rd Qu.:0.229931   3rd Qu.:0.300477        
##  Max.   :0.823514   Max.   :0.999521   Max.   :0.965497        
##  Win(square(log(TotalBsmtSF)))
##  Min.   :0.003976             
##  1st Qu.:0.076883             
##  Median :0.226443             
##  Mean   :0.233493             
##  3rd Qu.:0.299516             
##  Max.   :0.966195
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    .data[[x]] != 0
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
##   TotalBsmtSF       log(TotalBsmtSF)   square(log(TotalBsmtSF))
##  Min.   :0.002517   Min.   :0.006178   Min.   :0.004722        
##  1st Qu.:0.143176   1st Qu.:0.128517   1st Qu.:0.135745        
##  Median :0.331959   Median :0.323649   Median :0.326473        
##  Mean   :0.315899   Mean   :0.310116   Mean   :0.313815        
##  3rd Qu.:0.407319   3rd Qu.:0.414363   3rd Qu.:0.418140        
##  Max.   :0.903203   Max.   :0.994652   Max.   :0.990654        
##  Win(square(log(TotalBsmtSF)))
##  Min.   :0.005277             
##  1st Qu.:0.136215             
##  Median :0.327097             
##  Mean   :0.314560             
##  3rd Qu.:0.420007             
##  Max.   :0.988335
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features') +
  xlab(label = 'Subset with no 0s')

y_lst = c('log(SalePrice)')
for (feat in x_lst) {
  plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}

Looks to be some clustering here, two or three groups. Maybe those with and without a second basement, maybe basement types. I think regression will suss out what information is needed, but it’s worth noting.

The raw feature actually correlates much better with the target variable when 0s are included. It might be worth keeping the raw features for the Lasso regression to weed through. Normalizing apart from 0s may or may not help other regressions; it would be ideal for some kind of combo regression that splits/clusters then finds a linear regression, which is sort of what I’m attempting by feeding the binary version of these features to Lasso (where not already encoded by a factor). Normalizing apart for 0s is not likely to help decision trees much, but may help KNN a little by pulling 0s farther away and outliers closer.

3.40.4 Hard Code

x = 'Win(square(log(TotalBsmtSF)))'

min_val = min(Win_log_x_squared)
max_val = max(Win_log_x_squared)
print(paste("min_val:", min_val))
## [1] "min_val: 32.6597927030784"
print(paste("max_val:", max_val))
## [1] "max_val: 60.3486636755867"
val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(square(log(TotalBsmtSF)))' = ifelse(
      TotalBsmtSF == 0,
      0,
      Winsorize(
        log(TotalBsmtSF)^2,
        # probs = c(0.005, 0.995),
        minval = min_val,
        maxval = max_val
      )
    )
  ) %>%
  select(-c('Win(log(TotalBsmtSF))', 'log(TotalBsmtSF)'))

gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)

3.40.5 Binarize

I can create a binary for basement when I one-hot encode during modeling, but I want to be able to use it for exploration here.

val_train_Xy = val_train_Xy %>%
  mutate('Bsmt.bin' = factor(ifelse(TotalBsmtSF == 0, 0, 1), ordered = T))

x = 'Bsmt.bin'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)

## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 20)

heatmap.2(
    x = as.matrix(p_vals$pval_df),
    scale = 'none',
    Rowv = F,
    Colv = F,
    dendrogram = 'none',
    cellnote = format(p_vals$pval_df, digits = 2),
    notecex = 0.75,
    notecol = 'black',
    main = paste(y, 'p-values'),
    key = F
  )

print(
    paste(
      "Levels w/ significantly different",
      y,
      "than another level:"
    )
  )
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "1" "0"
val_train_Xy = select(val_train_Xy, -c('Bsmt.bin'))

3.41 TotalBsmtFinSF

Back to top.

TotalBsmtSF - BsmtUnfSF

3.41.1 Normalize

x = 'TotalBsmtFinSF'
val_train_Xy = val_train_Xy %>%
  mutate('TotalBsmtFinSF' = TotalBsmtSF - BsmtUnfSF)

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  TotalBsmtFinSF  
##  Min.   :   0.0  
##  1st Qu.:   0.0  
##  Median : 456.0  
##  Mean   : 493.6  
##  3rd Qu.: 786.0  
##  Max.   :5644.0
sum_and_trans_cont(
  data = val_train_Xy[val_train_Xy[[x]] != 0, ],
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 50,
  t_binw = 1
)

## NULL
x = 'sqrt(TotalBsmtFinSF)'
val_train_Xy = val_train_Xy %>%
  mutate('sqrt(TotalBsmtFinSF)' = sqrt(TotalBsmtFinSF))

# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
  df = val_train_Xy,
  feats_lst = num_feats,
  funcs_lst = funcs_lst,
  exclude_vals = list(0)
)

summary(val_train_Xy[x])
##  sqrt(TotalBsmtFinSF)
##  Min.   : 0.00       
##  1st Qu.: 0.00       
##  Median :21.35       
##  Mean   :17.39       
##  3rd Qu.:28.04       
##  Max.   :75.13
sum_and_trans_cont(
  data = val_train_Xy[val_train_Xy[[x]] != 0, ],
  x = x,
  func = best_normalizers[[x]]$best_func$func,
  func_name = best_normalizers[[x]]$best_func$name,
  x_binw = 1,
  t_binw = 1
)

## NULL

3.41.2 Winsorize

Just the non-zero set.

x = 'sqrt(TotalBsmtFinSF)'
df = val_train_Xy[val_train_Xy$TotalBsmtFinSF != 0, ]

qqnorm(y = df$TotalBsmtFinSF, ylab = 'TotalBsmtFinSF')
qqline(y = df$TotalBsmtFinSF, ylab = 'TotalBsmtFinSF')

qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)

Win_sqrt_x = Winsorize(
  x = df[[x]],
  probs = c(0.03, 0.995),
  na.rm = T
)

qqnorm(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
qqline(y = Win_sqrt_x, ylab = 'Win_sqrt_x')

Win_raw_x = Winsorize(
  x = df$TotalBsmtFinSF,
  probs = c(0.001, 0.99),
  na.rm = T
)

qqnorm(y = Win_raw_x, ylab = 'Win(TotalBsmtFinSF)')
qqline(y = Win_raw_x, ylab = 'Win(TotalBsmtFinSF)')

print(shapiro.test(x = df$TotalBsmtFinSF))
## 
##  Shapiro-Wilk normality test
## 
## data:  df$TotalBsmtFinSF
## W = 0.83537, p-value < 2.2e-16
print(shapiro.test(x = df$`sqrt(TotalBsmtFinSF)`))
## 
##  Shapiro-Wilk normality test
## 
## data:  df$`sqrt(TotalBsmtFinSF)`
## W = 0.97064, p-value = 3.291e-08
print(shapiro.test(x = Win_sqrt_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_sqrt_x
## W = 0.99214, p-value = 0.01256
print(shapiro.test(x = Win_raw_x))
## 
##  Shapiro-Wilk normality test
## 
## data:  Win_raw_x
## W = 0.97166, p-value = 5.265e-08
val_train_Xy = val_train_Xy %>%
  mutate(
    'Win(sqrt(TotalBsmtFinSF))' = ifelse(
      TotalBsmtFinSF == 0,
      0,
      Winsorize(
        x = sqrt(TotalBsmtFinSF),
        # probs = c(0.01, 0.995),
        minval = min(Win_sqrt_x),
        maxval = max(Win_sqrt_x)
      )
    )
  ) %>%
  mutate(
    'Win(TotalBsmtFinSF)' = ifelse(
      TotalBsmtFinSF == 0,
      0,
      Winsorize(
        x = TotalBsmtFinSF,
        # probs = c(0.001, 0.99)
        minval = min(Win_raw_x),
        maxval = max(Win_raw_x)
      )
    )
  )

x = 'Win(sqrt(TotalBsmtFinSF))'

3.41.3 Correlations

num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('TotalBsmtFinSF', 'sqrt(TotalBsmtFinSF)', x)

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    !is.na(.data[[x]])
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
##  TotalBsmtFinSF      sqrt(TotalBsmtFinSF) Win(sqrt(TotalBsmtFinSF))
##  Min.   :0.0004767   Min.   :0.007371     Min.   :0.008608         
##  1st Qu.:0.0941765   1st Qu.:0.093248     1st Qu.:0.098192         
##  Median :0.2387013   Median :0.166897     Median :0.155155         
##  Mean   :0.2619773   Mean   :0.222351     Mean   :0.218137         
##  3rd Qu.:0.3211208   3rd Qu.:0.277837     3rd Qu.:0.277832         
##  Max.   :0.9567699   Max.   :0.957414     Max.   :0.955968
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features')

df = get_cors(
  data = filter(
    select(val_train_Xy, all_of(num_feats)),
    .data[[x]] != 0
  ),
  x_lst = x_lst,
  feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
##  TotalBsmtFinSF     sqrt(TotalBsmtFinSF) Win(sqrt(TotalBsmtFinSF))
##  Min.   :0.002897   Min.   :0.0002521    Min.   :0.0187           
##  1st Qu.:0.112971   1st Qu.:0.1000694    1st Qu.:0.1153           
##  Median :0.274222   Median :0.2348650    Median :0.2413           
##  Mean   :0.295891   Mean   :0.2677646    Mean   :0.2713           
##  3rd Qu.:0.388785   3rd Qu.:0.3526043    3rd Qu.:0.3546           
##  Max.   :0.917361   Max.   :0.9644098    Max.   :0.9887
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
  geom_boxplot(notch = T) +
  ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')

y_lst = c('log(SalePrice)')
for (feat in x_lst) {
  plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}

3.41.4 Hard Code

x = 'Win(sqrt(TotalBsmtFinSF))'

min_val = min(Win_sqrt_x)
max_val = max(Win_sqrt_x)
print(paste("min_val:", min_val))
## [1] "min_val: 10.1234508028247"
print(paste("max_val:", max_val))
## [1] "max_val: 46.3884140769296"
# Already hard coded above.
val_train_Xy = select(val_train_Xy, -c('Win(TotalBsmtFinSF)'))

gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)

# Continue in EDA_pt2

3.42 Serialize Dataframe for Storage

Back to top.

val_train_Xy$Id = val_train_X$Id

saveRDS(val_train_Xy, 'data/eda_val_train_Xy.rds')
head(readRDS('data/eda_val_train_Xy.rds'))